import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

def _load(path):
    """Unpickle one DataFrame from *path*, closing the file afterwards.

    The original calls used bare open() and leaked the file handles.
    NOTE(review): pickle.load is only safe here because these pickles are
    our own pipeline artifacts; never unpickle untrusted data.
    """
    with open(path, "rb") as fh:
        return pickle.load(fh)

df = _load("pj_df_full.20190629_095112.pkl")                # cases corpus
dfmo = _load("posp/pj_demo_dfmo_full.20190629_163240.pkl")  # political speeches dataset
dfre = _load("re/pj_dfre_full.20190629_115819.pkl")         # reuters dataset
dftw = _load("tw/pj_dftw_full.20190630_005652.pkl")         # twitter dataset
def _compare_hist(column, title):
    """Overlay normalized histograms of *column* for all four corpora.

    The original six cells were copy-pastes differing only in the column
    name and the title; bins, range, alpha and colors were identical.
    """
    corpora = [(df, "cases ", 'g'), (dfmo, "pol speeches ", 'b'),
               (dfre, "reuters ", 'r'), (dftw, "twitter ", 'violet')]
    for data, label, color in corpora:
        plt.hist(data[column], bins=50, alpha=0.5, label=label,
                 range=(0.01, 0.9), color=color, density=True)
    plt.title(title)
    plt.legend(loc='upper right')
    plt.show()

# POS-tagging based tense fractions across the four corpora ...
_compare_hist("antpast", "POS tagging, fraction of verbs in past tense ")
_compare_hist("antpresent", "POS tagging, fraction of verbs in present tense ")
_compare_hist("antfuture", "POS tagging, fraction of verbs in future tense ")
# ... then the LIWC temporal-focus fractions.
_compare_hist("antfpast", "LIWC, fraction of focus past")
_compare_hist("antfpresent", "LIWC, fraction of focus present")
_compare_hist("antffuture", "LIWC, fraction of focus future")
def _corpus_hist(columns_labels, title):
    """Overlay three normalized histograms of the cases corpus `df`."""
    for (column, label), color in zip(columns_labels, ('blue', 'orange', 'green')):
        plt.hist(df[column], 50, density=True, facecolor=color, alpha=0.75, label=label)
    plt.title(title)
    plt.legend(loc='best')
    plt.show()

# Tense mix per the POS tagger, then focus mix per LIWC, cases corpus only.
_corpus_hist([("antpast", 'past tense'), ("antpresent", 'present tense'),
              ("antfuture", 'future tense')],
             "POS tagging, all cases in the cases corpus")
_corpus_hist([("antfpast", 'past focus'), ("antfpresent", 'present focus'),
              ("antffuture", 'future focus')],
             "LIWC, all cases in the cases corpus")
# Corpus-wide tense/focus proportions: sum raw counts over all documents,
# then normalize.  np.array makes the elementwise division explicit -- the
# original divided a plain list by a scalar, which only worked because the
# pandas sums happen to be NumPy scalars that broadcast over the list.
spas = df["npast"].sum()      # spacy POS: past-tense verb count
sprs = df["npresent"].sum()   # spacy POS: present-tense verb count
sfus = df["nfuture"].sum()    # spacy POS: future-tense verb count
# NOTE(review): the odd spelling `sAvezfuture` is kept because later cells
# read exactly this identifier.
sAvepast, sAvepresent, sAvezfuture = np.array([spas, sprs, sfus]) / (spas + sprs + sfus)
lpas = df["nfpast"].sum()     # LIWC: past-focus word count
lprs = df["nfpresent"].sum()  # LIWC: present-focus word count
lfus = df["nffuture"].sum()   # LIWC: future-focus word count
lAvepast, lAvepresent, lAvezfuture = np.array([lpas, lprs, lfus]) / (lpas + lprs + lfus)
We can use this test if we observe two independent samples from the same or different populations. The test measures whether the average (expected) value differs significantly across samples. If we observe a large p-value, for example larger than 0.05 or 0.1, then we cannot reject the null hypothesis of identical averages. If the p-value is smaller than a threshold, e.g. 1%, 5% or 10%, then we reject the null hypothesis of equal averages. The two samples do not need to have the same length.
Tests the null hypothesis that the categorical data has the given frequencies. Signature: (f_obs, f_exp, ddof=0, axis=0). The p-value is computed using a chi-squared distribution with k - 1 - ddof degrees of freedom, where k is the number of observed frequencies.
from scipy.stats import ttest_ind  # ttest_ind tests the equality of means
from scipy.stats import chisquare  # chisquare: goodness of fit of observed vs expected frequencies
# calculate the proportions of past, present and future tenses averaged across the entire corpus
# see above POS tagging
print(sAvepast, sAvepresent, sAvezfuture)
# chi-squared test: observed vs expected (= uniformly distributed) data,
# POS tagging, entire cases corpus.  The original cell hard-coded [49, 49, 2];
# the commented-out line above it shows the intent was the computed
# proportions (x100 so both vectors sum to ~100, matching the LIWC cell
# below) but used the misspelled name `sAvefuture`.
chisquare([100*sAvepast, 100*sAvepresent, 100*sAvezfuture], [33, 33, 34])
# high test statistic and low p-value indicate that the observed distribution is unequal to the expected, uniform distribution
# hence the use of past, present and future tenses in the case corpus is deliberate
# calculate the proportions of past, present and future focus averaged across the entire corpus
# see above LIWC
print(lAvepast, lAvepresent, lAvezfuture)
# chisquared test observed data vs expected (=uniformly distributed) data LIWC entire cases corpus
chisquare([100*lAvepast, 100*lAvepresent, 100*lAvezfuture], [33, 33, 34])
# high test statistic and low p-value indicate that the observed distribution is unequal to the expected, uniform distribution
# hence the use of past, present and future focus in the case corpus is deliberate
# Political-speeches corpus: corpus-wide tense (POS) and focus (LIWC) proportions.
spas = dfmo['npast'].sum()
sprs = dfmo['npresent'].sum()
sfus = dfmo['nfuture'].sum()
sAvepast, sAvepresent, sAvefuture = np.array([spas, sprs, sfus]) / (spas + sprs + sfus)
lpas = dfmo['nfpast'].sum()
lprs = dfmo['nfpresent'].sum()
lfus = dfmo['nffuture'].sum()
lAvepast, lAvepresent, lAvefuture = np.array([lpas, lprs, lfus]) / (lpas + lprs + lfus)
print("political speeches, both speakers POS ", sAvepast, sAvepresent, sAvefuture)
# chi-squared vs a uniform expectation.  The original hard-coded [22, 68, 10]
# (apparently rounded from the printed proportions -- verify against the
# print above); using the live values keeps the test in sync with the data.
chisquare([100*sAvepast, 100*sAvepresent, 100*sAvefuture], [33, 33, 34])
# test result indicates POS past/present/future tenses distribution not random
print("political speeches, both speakers LIWC ", lAvepast, lAvepresent, lAvefuture)
chisquare([100*lAvepast, 100*lAvepresent, 100*lAvefuture], [33, 33, 34])
# test result indicates LIWC past/present/future focus distribution not random
# Reuters corpus: corpus-wide tense (spaCy POS) and focus (LIWC) proportions.
spa = dfre["npast"].sum()
spr = dfre["npresent"].sum()
sfu = dfre["nfuture"].sum()
stot = spa + spr + sfu  # total POS-tagged tense counts, hoisted out of the divisions
sAvepast = spa / stot
sAvepresent = spr / stot
sAvezfuture = sfu / stot
lpa = dfre["nfpast"].sum()
lpr = dfre["nfpresent"].sum()
lfu = dfre["nffuture"].sum()
ltot = lpa + lpr + lfu  # total LIWC focus counts
lAvepast = lpa / ltot
lAvepresent = lpr / ltot
lAvezfuture = lfu / ltot
# Side-by-side grouped bar chart: LIWC row vs Spacy row.
modfplot = pd.DataFrame({'Avepast': [lAvepast, sAvepast],
                         'Avepresent': [lAvepresent, sAvepresent],
                         'Avezfuture': [lAvezfuture, sAvezfuture]},
                        index=['LIWC', 'Spacy'])
modfplot.plot.bar(rot=0)
plt.show()
print("POS:", sAvepast, sAvepresent, sAvezfuture, " LIWC:", lAvepast, lAvepresent, lAvezfuture)
# LIWC, reuters: past/present/future focus vs a uniform expectation.
# The original hard-coded [48, 41, 11]; use the computed proportions instead.
chisquare([100*lAvepast, 100*lAvepresent, 100*lAvezfuture], [33, 33, 34])
# large chi-square statistic and small p-value indicate the observed distribution is not uniform
# POS tagging, reuters: past/present/future tense vs a uniform expectation
# (original hard-coded [60, 35, 5]).
chisquare([100*sAvepast, 100*sAvepresent, 100*sAvezfuture], [33, 33, 34])
# large chi-square statistic and small p-value indicate the observed distribution is not uniform
# Cases corpus: aggregate tense counts (spaCy) and focus counts (LIWC),
# normalize to proportions, and draw a LIWC-vs-Spacy grouped bar chart.
pos_counts = [df["npast"].sum(), df["npresent"].sum(), df["nfuture"].sum()]  # spacy
spas, sprs, sfus = pos_counts
sAvepast, sAvepresent, sAvezfuture = [c / (spas + sprs + sfus) for c in pos_counts]
liwc_counts = [df["nfpast"].sum(), df["nfpresent"].sum(), df["nffuture"].sum()]  # LIWC
lpas, lprs, lfus = liwc_counts
lAvepast, lAvepresent, lAvezfuture = [c / (lpas + lprs + lfus) for c in liwc_counts]
modfplot = pd.DataFrame(
    {'Avepast': [lAvepast, sAvepast],
     'Avepresent': [lAvepresent, sAvepresent],
     'Avezfuture': [lAvezfuture, sAvezfuture]},
    index=['LIWC', 'Spacy'])
modfplot.plot.bar(rot=0)
plt.legend(loc='upper center')
plt.show()
# POS tagging: mean fraction of past/present/future tense per year.
# Rows with year == 0 are excluded -- presumably documents with an unknown
# year (TODO confirm); filter once instead of three times.
dated = df[df["year"] != 0]
dated.groupby('year')['antpast'].mean().plot(color='red', label='past tense')
dated.groupby('year')['antpresent'].mean().plot(color='green', label='present tense')
dated.groupby('year')['antfuture'].mean().plot(color='blue', label='future tense')
plt.legend(loc='best')
plt.show()
# Two-sample t-test: per-document past-tense fraction before vs after 1970.
pasttense_before1970 = df[df["year"] < 1970]["antpast"]
pasttense_after1970 = df[df["year"] >= 1970]["antpast"]
# indicates the mean before 1970 statistically significantly differs from after 1970
ttest_ind(pasttense_before1970, pasttense_after1970)
# LIWC: mean fraction of past/present/future focus per year.
dated.groupby('year')['antfpast'].mean().plot(color='red', label='past focus')
dated.groupby('year')['antfpresent'].mean().plot(color='green', label='present focus')
dated.groupby('year')['antffuture'].mean().plot(color='blue', label='future focus')
plt.legend(loc='best')
plt.show()
pastfocus_before1970 = df[df["year"] < 1970]["antfpast"]
pastfocus_after1970 = df[df["year"] >= 1970]["antfpast"]
# indicates the mean before 1970 statistically significantly differs from after 1970
ttest_ind(pastfocus_before1970, pastfocus_after1970)
# Do POS tagging and LIWC agree?  Column-vs-column t-tests per corpus.
# Cases corpus, pooled-variance tests: high t-statistic / low p-value on all
# three comparisons -- the POS and LIWC means are not equal.
for liwc_col, pos_col in (("antfpast", "antpast"),
                          ("antfpresent", "antpresent"),
                          ("antffuture", "antfuture")):
    ttest_ind(df[liwc_col], df[pos_col])
# Political speeches, both speakers: Welch tests with NaNs omitted.
# Past and present come out with equal means; future does not.
for pos_col, liwc_col in (("antpast", "antfpast"),
                          ("antpresent", "antfpresent"),
                          ("antfuture", "antffuture")):
    ttest_ind(dfmo[pos_col], dfmo[liwc_col], equal_var=False, nan_policy='omit')
# Reuters and Twitter corpora: identical proportion computation and
# LIWC-vs-Spacy bar chart, so both run through one loop instead of the
# original verbatim twin cells.
for frame in (dfre, dftw):
    spa = frame["npast"].sum()
    spr = frame["npresent"].sum()
    sfu = frame["nfuture"].sum()
    sAvepast = spa / (spa + spr + sfu)
    sAvepresent = spr / (spa + spr + sfu)
    sAvezfuture = sfu / (spa + spr + sfu)
    lpa = frame["nfpast"].sum()
    lpr = frame["nfpresent"].sum()
    lfu = frame["nffuture"].sum()
    lAvepast = lpa / (lpa + lpr + lfu)
    lAvepresent = lpr / (lpa + lpr + lfu)
    lAvezfuture = lfu / (lpa + lpr + lfu)
    modfplot = pd.DataFrame({'Avepast': [lAvepast, sAvepast],
                             'Avepresent': [lAvepresent, sAvepresent],
                             'Avezfuture': [lAvezfuture, sAvezfuture]},
                            index=['LIWC', 'Spacy'])
    modfplot.plot.bar(rot=0)
    plt.show()
# Within-corpus tense comparisons (POS tagging).
# Cases corpus, pooled-variance tests: high t-statistic and low p-value on
# both comparisons -- past vs present and present vs future means differ.
ttest_ind(df["antpast"], df["antpresent"])
ttest_ind(df["antpresent"], df["antfuture"])
# Speeches, Reuters and Twitter: same two comparisons per corpus, Welch
# t-test with NaNs dropped.  All show deliberate (non-random) tense usage.
for corpus in (dfmo, dfre, dftw):
    ttest_ind(corpus['antpast'], corpus['antpresent'], equal_var=False, nan_policy='omit')
    ttest_ind(corpus['antpresent'], corpus['antfuture'], equal_var=False, nan_policy='omit')
def _tense_props(frame, cols):
    """Sum the three count columns of *frame* and normalize to proportions."""
    counts = [frame[c].sum() for c in cols]
    total = sum(counts)
    return [c / total for c in counts]

# Split the cases corpus by party of the appointing president.
df_republican = df[df["Party"] == "Republican"]
df_democratic = df[df["Party"] == "Democratic"]
# POS tagging: past/present/future tense proportions, rep vs dem.
rAvepast, rAvepresent, rAvefuture = _tense_props(df_republican, ['npast', 'npresent', 'nfuture'])
dAvepast, dAvepresent, dAvefuture = _tense_props(df_democratic, ['npast', 'npresent', 'nfuture'])
modfplot = pd.DataFrame({'Ave1past': [rAvepast, dAvepast],
                         'Ave2present': [rAvepresent, dAvepresent],
                         'Avef3future': [rAvefuture, dAvefuture]},
                        index=['rep', 'dem'])
modfplot.plot.bar(rot=0)
plt.show()
# Per-document tense fractions, rep vs dem (POS tagging): none of the three
# differs significantly between republican and democratic judges.
ttest_ind(df_republican["antpast"], df_democratic["antpast"])
ttest_ind(df_republican["antpresent"], df_democratic["antpresent"])
ttest_ind(df_republican["antfuture"], df_democratic["antfuture"])
# LIWC: past/present/future focus proportions, rep vs dem.
rAvefpast, rAvefpresent, rAveffuture = _tense_props(df_republican, ['nfpast', 'nfpresent', 'nffuture'])
dAvefpast, dAvefpresent, dAveffuture = _tense_props(df_democratic, ['nfpast', 'nfpresent', 'nffuture'])
modfplot = pd.DataFrame({'Avef1past': [rAvefpast, dAvefpast],
                         'Avef2present': [rAvefpresent, dAvefpresent],
                         'Avef3future': [rAveffuture, dAveffuture]},
                        index=['rep', 'dem'])
modfplot.plot.bar(rot=0)
plt.show()
# LIWC focus fractions, rep vs dem: again no significant differences.
ttest_ind(df_republican["antfpast"], df_democratic["antfpast"])
ttest_ind(df_republican["antfpresent"], df_democratic["antfpresent"])
ttest_ind(df_republican["antffuture"], df_democratic["antffuture"])
def _speaker_props(name):
    """Summed past/present/future verb counts for one speaker, normalized."""
    counts = [dfmo.loc[dfmo['speaker'] == name, col].sum()
              for col in ('npast', 'npresent', 'nfuture')]
    total = sum(counts)
    return [c / total for c in counts]

# Obama vs McCain: POS tense proportions side by side.
oAvepast, oAvepresent, oAvefuture = _speaker_props('Obama')
mAvepast, mAvepresent, mAvefuture = _speaker_props('McCain')
print("POS tagging\nMcCain:", mAvepast, mAvepresent, mAvefuture, "\nObama:", oAvepast, oAvepresent, oAvefuture)
modtplot = pd.DataFrame({'Ave1past': [mAvepast, oAvepast],
                         'Ave2present': [mAvepresent, oAvepresent],
                         'Ave3future': [mAvefuture, oAvefuture]},
                        index=['McCain', 'Obama'])
modtplot.plot.bar(rot=0)
plt.show()
# Modal-verb usage (would, could, might) as a fraction of all verbs.
# NOTE(review): a zero nverbs row would produce inf here; nan_policy='omit'
# below drops NaNs but not infs -- assumed nverbs > 0, TODO confirm.
dfmo['modal_ratio'] = dfmo['lmodal'] / dfmo['nverbs']
# Slice per speaker once (the original sliced these frames twice).
dfmo_McCain = dfmo[dfmo["speaker"] == 'McCain']
dfmo_Obama = dfmo[dfmo["speaker"] == 'Obama']
# Per-speech tense fractions, Obama vs McCain: no significant difference
# for past, present or future.
ttest_ind(dfmo_Obama['antpast'], dfmo_McCain['antpast'], equal_var=False, nan_policy='omit')
ttest_ind(dfmo_Obama['antpresent'], dfmo_McCain['antpresent'], equal_var=False, nan_policy='omit')
ttest_ind(dfmo_Obama['antfuture'], dfmo_McCain['antfuture'], equal_var=False, nan_policy='omit')
dfmo.loc[dfmo["speaker"] == 'McCain', 'modal_ratio'].describe()
dfmo.loc[dfmo["speaker"] == 'Obama', 'modal_ratio'].describe()
# Obama vs McCain usage of modals: the test statistic and p-value indicate
# the mean modal_ratio differs significantly between the two speakers.
ttest_ind(dfmo_Obama['modal_ratio'], dfmo_McCain['modal_ratio'], equal_var=False, nan_policy='omit')
# `findd` (project-local tp_utils) is the only name this section uses from
# that module; import it explicitly instead of `import *`.
from tp_utils import findd
# Documents ranked by deontic-language count, highest first.
df.loc[df["ldeont"] > 0, :].sort_values("ldeont", ascending=False)
findd(df.loc["X4AKVH", "doc"])  # find deontic futures in one of cases corpus's doc
findd(df.loc["X42IM4", "doc"])  # find deontic futures in one of cases corpus's doc
findd(df.loc["X3IMFV", "doc"])  # find deontic futures in one of cases corpus's doc
# Absolute modal-verb counts across the four corpora, range clipped to >= 1.
plt.hist(df["lmodal"], bins=50, alpha=0.5, label="cases ", range=(1, 50), color='g', density=True)
plt.hist(dfmo["lmodal"], bins=50, alpha=0.5, label="pol speeches ", range=(1, 50), color='b', density=True)
plt.hist(dfre["lmodal"], bins=50, alpha=0.5, label="reuters ", range=(1, 50), color='r', density=True)
plt.hist(dftw["lmodal"], bins=50, alpha=0.5, label="twitter ", range=(1, 50), color='violet', density=True)
plt.title("absolute numbers of modal verbs in 4 datasets , restricted to >= 1 modal ")
plt.legend(loc='upper right')
plt.show()
# Reload the cases corpus fresh for the modeling section.
import pickle
with open("pj_df_full.20190629_095112.pkl", "rb") as fh:
    df = pickle.load(fh)
# Keep only the two major parties and encode Republican=1, Democratic=0.
# .copy() plus a plain column assignment replaces the original
# `inplace=True` replace on a slice, which triggers SettingWithCopyWarning
# and may silently fail to write through.
df2 = df[(df['Party'] == 'Republican') | (df['Party'] == 'Democratic')].copy()
len(df2)
df2['Party'] = df2['Party'].replace(['Republican', 'Democratic'], [1, 0])
df2.head()
from sklearn.model_selection import train_test_split
features = ['antpast', 'antfuture', 'Party', 'nlets', 'lmodal', 'year']
X = df2[features]  # kept for later cells; the split below uses df2 directly
X_train, X_test = train_test_split(df2, test_size=0.3, random_state=1234)
from sklearn import linear_model
# OLS baseline: predict log citation counts from the feature list assigned
# in the previous cell (POS tense fractions plus document/judge covariates).
reg = linear_model.LinearRegression()
reg.fit(df2[features], df2['log_cites'])
reg.score(df2[features], df2['log_cites'])  # in-sample R^2
# Second fit with LIWC focus counts instead.
# NOTE(review): 'nfuture' here is a POS count among LIWC counts -- possibly
# a typo for 'nffuture'; confirm before reusing these results.
features = ['nfpast', 'nfpresent', 'nfuture', 'Party', 'nlets', 'lmodal', 'year']
reg = linear_model.LinearRegression()
reg.fit(df2[features], df2['log_cites'])
reg.score(df2[features], df2['log_cites'])
from sklearn import linear_model
features = ['nfpast', 'nfpresent', 'nfuture', 'Party', 'nlets', 'lmodal', 'year']
from sklearn.linear_model import LogisticRegression
# Classification target switches to case_reversed; `features` still holds
# the list assigned just above.
clflo = LogisticRegression(random_state = 1234, solver = 'liblinear').fit(X_train[features], X_train['case_reversed'])
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
ypreda = clflo.predict(X_train[features])  # train-set predictions
yprede = clflo.predict(X_test[features])   # test-set predictions
print(accuracy_score(X_train['case_reversed'], ypreda), accuracy_score(X_test['case_reversed'], yprede))
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes on the reversal outcome.
gnb = GaussianNB()
features = ['nsents', 'npast', 'nfuture', 'lmodal', 'Party', 'year']
# Use .values for both fit and predict -- the original mixed a raw array at
# fit time with a DataFrame at predict time.
gnb.fit(X_train[features].values, X_train['case_reversed'])
y_pred = gnb.predict(X_test[features].values)
cm = confusion_matrix(X_test['case_reversed'], y_pred)
tn, fp, fn, tp = cm.ravel()
print ("accuracy:", (tn + tp) / np.sum(cm))
from sklearn import svm
# Three SVM variants, same train/test protocol.  Lists replace the original
# generator expressions, which deferred every fit until the confusion-matrix
# pass and could only be consumed once.
C = 1.0
models = [svm.SVC(kernel='linear', C=C),
          svm.LinearSVC(C=C),
          svm.SVC(kernel='rbf', gamma='auto', C=C)]
cms = []
for clf in models:
    clf.fit(X_train[features], X_train['case_reversed'])
    cms.append(confusion_matrix(X_test['case_reversed'], clf.predict(X_test[features])))
for cm in cms:
    tn, fp, fn, tp = cm.ravel()
    print ("accuracy:", (tn + tp) / np.sum(cm))
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# 10-fold cross-validated accuracy of a logistic-loss SGD classifier
# on the case_reversed outcome.
features = ['antpast', 'antfuture', 'Party', 'nlets', 'lmodal', 'year']
X = df2[features]
y = df2['log_cites']
X_train, X_test = train_test_split(df2, test_size = 0.3, random_state = 1234)
n_folds = 10
kf = KFold(n_splits = n_folds, shuffle=True, random_state = 1234)
# NOTE(review): loss='log' was renamed to 'log_loss' in scikit-learn 1.1;
# update this argument when the environment is upgraded.
cv_results = cross_val_score(SGDClassifier(max_iter = 1000, tol = 1e-03, loss='log', penalty = 'l2'), X_train[features],
X_train['case_reversed'], scoring = 'accuracy', cv=kf)
print ("accuracy mean:", cv_results.mean(), " std:", cv_results.std())
from sklearn.linear_model import LogisticRegressionCV
# Regularization-path CV logistic regression; score below is in-sample only.
clf = LogisticRegressionCV(cv = 10, random_state = 1234, multi_class ='ovr').fit(X_train[features], X_train['case_reversed'])
clf.score(X_train[features], X_train['case_reversed'])
from sklearn import clone
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier
# Compare tree-based classifiers on the reversal outcome with one shared
# train/evaluate loop.
n_estimators = 100
models = [DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(n_estimators=n_estimators),
          ExtraTreesClassifier(n_estimators=n_estimators),
          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), n_estimators=n_estimators)]
for model in models:
    # Fit the clone, not the template: the original did
    # `clf = clone(model)` and then `clf = model.fit(...)`, which threw the
    # clone away and mutated the entries of `models` in place.
    clf = clone(model)
    clf.fit(X_train[features], X_train['case_reversed'])
    y_pred = clf.predict(X_test[features])
    cm = confusion_matrix(X_test['case_reversed'], y_pred)
    tn, fp, fn, tp = cm.ravel()
    print ("accuracy:", (tn + tp) / np.sum(cm))
# Rebuild df2 for the instrumental-variable analysis below (context manager
# closes the file; .copy() avoids the chained-assignment warning).
with open("pj_df_full.20190629_095112.pkl", "rb") as fh:
    df = pickle.load(fh)  # cases corpus
df2 = df[(df['Party'] == 'Republican') | (df['Party'] == 'Democratic')].copy()
df2['Party'] = df2['Party'].replace(['Republican', 'Democratic'], [1, 0])
from linearmodels.iv import IV2SLS
df2['_const'] = 1  # explicit intercept column for linearmodels
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import time
# VADER compound sentiment per document.  A vectorized map over the column
# replaces the original per-row df2.loc writes (one DataFrame write per
# document), which dominated this cell's runtime.
start_time = time.time()
sid = SentimentIntensityAnalyzer()
df2['cs'] = df2['doc'].map(lambda d: sid.polarity_scores(str(d))['compound'])
print (time.time() - start_time)
# Plain OLS (IV2SLS with no endogenous regressors), errors clustered by year.
features = ['_const', 'antpast', 'antfuture', 'nlets', 'lmodal', 'year', 'Party', 'cs']
res_ols = IV2SLS(dependent=df2['log_cites'], exog=df2[features], endog=None,
                 instruments=None).fit(cov_type='clustered', clusters=df2['year'])
print(res_ols)
# First stage: regress sentiment on the candidate instruments.
features = ['_const', 'antpast', 'antfuture', 'nlets', 'lmodal', 'year', 'Party']
res_1st = IV2SLS(dependent=df2['cs'], exog=df2[features], endog=None,
                 instruments=None).fit(cov_type='clustered', clusters=df2['year'])
print(res_1st)
# 2SLS proper: sentiment instrumented by the text covariates.
features = ['_const', 'antpast', 'antfuture', 'nlets', 'lmodal', 'year']
res_2nd = IV2SLS(dependent=df2['log_cites'], exog=df2[['_const']], endog=df2['cs'],
                 instruments=df2[['antpast', 'antfuture', 'nlets', 'lmodal', 'year', 'Party']]
                 ).fit(cov_type='clustered', clusters=df2['year'])
print(res_2nd)
res_2nd.wu_hausman()
# The Wu-Hausman statistic (a variant of the Durbin-Wu-Hausman test -- the
# original comment said "Durbin-Watson") tests exogeneity of the
# instrumented regressors; H0 is rejected if the statistic exceeds the
# critical value or the p-value is below the chosen level.
# Davidson & MacKinnon: Estimation and Inference in Econometrics, chapter 8.
# Here H0 can be rejected, hence not all endogenous variables are exogenous.
import statsmodels.api as sm
# Same two OLS fits as the sklearn section, but through statsmodels so the
# full coefficient tables are available via .summary().
features = ['antpast', 'antfuture', 'Party', 'nlets', 'lmodal', 'year']
X = df2[features]
X.head()
y = df2['log_cites']
X = sm.add_constant(X)  # prepend the intercept column
X.head()
sm_model = sm.OLS(y, X).fit()
pred = sm_model.predict(X)
sm_model.summary()
# Repeat with the LIWC-count feature set.
features = ['nfpast', 'nfpresent', 'nfuture', 'Party', 'nlets', 'lmodal', 'year']
X = sm.add_constant(df2[features])
sm_model = sm.OLS(y, X).fit()
pred = sm_model.predict(X)
sm_model.summary()
from sklearn.preprocessing import Imputer
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn
# 0.22; this cell pins an older sklearn.  The modern replacement is
# sklearn.impute.SimpleImputer -- migrate when upgrading.
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from skll.metrics import spearman
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import warnings
RANDOM_STATE=1234
N_JOBS=8
# the modeling pipeline: impute missing values, then random-forest regression
pipe = Pipeline([("imputer", Imputer()),
("estimator", RandomForestRegressor(random_state=RANDOM_STATE))])
# rank-correlation scorer: Spearman's rho between predictions and targets
spearman_scorer = make_scorer(spearman)
# the hyperparameters to search over, including different imputation strategies
# (ranges were narrowed from earlier runs -- see the "was ..." notes)
rf_param_space = {
'imputer__strategy': Categorical(['mean', 'median', 'most_frequent']),
'estimator__max_features': Integer(1, 5), # was Integer(1, 8),
'estimator__n_estimators': Integer(50, 60), # was Integer(50, 500)
'estimator__min_samples_split': Integer(70, 85), # was Integer(2, 200)
}
# create our search object: 75 iterations of Bayesian optimization over
# 10-fold CV; failed fits score -9999 so they can never win
search = BayesSearchCV(pipe,
rf_param_space,
cv=10,
n_jobs=N_JOBS,
verbose=0,
error_score=-9999,
scoring=spearman_scorer,
random_state=RANDOM_STATE,
return_train_score=True,
n_iter=75)
import pickle
# Reload from the absolute path this time (same pickle as the top of the
# file); the context manager closes the handle the original leaked.
with open("/home/xhta/Robot/proj/pj_df_full.20190629_095112.pkl", "rb") as fh:
    df = pickle.load(fh)
df2 = df[(df['Party'] == 'Republican') | (df['Party'] == 'Democratic')].copy()
df2['Party'] = df2['Party'].replace(['Republican', 'Democratic'], [1, 0])
from sklearn.model_selection import train_test_split
# NOTE(review): test_size=0.7 puts 70% of the rows into the TEST split --
# presumably to keep the Bayesian search fast; confirm it is intentional.
X_train, X_test = train_test_split(df2, test_size=0.7, random_state=1234)
features = ['npast', 'npresent', 'Party', 'nlets', 'lmodal', 'year']
# attention, the hyperparameter search can take some time
import time
start_time = time.time()
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    search.fit(X_train[features], X_train['log_cites'])
print (time.time() - start_time)
search.best_params_
# CV score
search.best_score_
# CV standard deviation
search.cv_results_['std_test_score'][search.best_index_]
# Best pipeline components, pulled out for the inspection cells below.
estimator = search.best_estimator_.named_steps['estimator']
imputer = search.best_estimator_.named_steps['imputer']
estimator.feature_importances_
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
X = df2[features]
# get the feature importances from each tree and then visualize the
# distributions as boxplots: one box per feature, one data point per tree
# of the best random forest found by the search above
all_feat_imp_df = pd.DataFrame(data=[tree.feature_importances_ for tree in
estimator],
columns=list(X.columns))
(sns.boxplot(data=all_feat_imp_df)
.set(title='Feature Importance Distributions',
ylabel='Importance'))
plt.show()
%%time
import xgboost as xgb
xgc = xgb.XGBClassifier(n_estimators=500, max_depth=5, best_score=0.5, objective='binary:logistic', random_state=1234)
xgc.fit(X_train[features], X_train['case_reversed'])
pred = xgc.predict(X_test[features])
print (pred[0:10], X_test['case_reversed'][0:10])
fig = plt.figure(figsize = (16,12))
title = fig.suptitle("Default Feature Importance from XGBoost", fontsize=14)
ax1 = fig.add_subplot(2,2,1)
xgb.plot_importance(xgc, importance_type = 'weight', ax = ax1)
t = ax1.set_title("Feature Importance - Feature Weight")
ax2 = fig.add_subplot(2,2,2)
xgb.plot_importance(xgc, importance_type = 'gain', ax = ax2)
t = ax2.set_title("Feature Importance - Split Mean Gain")
ax3 = fig.add_subplot(2,2,3)
xgb.plot_importance(xgc, importance_type = 'cover', ax = ax3)
t = ax3.set_title("Feature Importance - Sample Coverage")
from skater.core.explanations import Interpretation
from skater.model import InMemoryModel
# Global feature importance via skater's model-agnostic interpreter.
interpreter = Interpretation(training_data=X_test[features], training_labels=X_test['case_reversed'], feature_names=features)
im_model = InMemoryModel(xgc.predict_proba, examples=X_train[features], target_names=['not reverted', 'reverted'])
plots = interpreter.feature_importance.plot_feature_importance(im_model, ascending=True, n_samples=1000)
# A second model trained on raw ndarrays, for LIME (LIME feeds plain rows).
# Fixes vs the original: `map_depth` was a typo for `max_depth` (XGBoost
# silently ignored the unknown kwarg, so the model trained at the default
# depth), and LimeTabularExplainer expects an array, not a DataFrame.
xgc_np = xgb.XGBClassifier(n_estimators=500, max_depth=5, base_score=0.5, objective='binary:logistic', random_state=1234)
xgc_np.fit(X_train[features].values, X_train['case_reversed'])
from skater.core.local_interpretation.lime.lime_tabular import LimeTabularExplainer
exp = LimeTabularExplainer(X_test[features].values,
                           feature_names=['npast', 'npresent', 'Party', 'nlets', 'lmodal', 'year'],
                           discretize_continuous=False,
                           class_names=['not reverted', 'reverted'])
# Explain the same TEST rows whose labels are printed -- the original
# explained X_train rows while printing X_test labels and predictions.
print('Actual Label:', X_test['case_reversed'][0])
print('Predicted Label:', pred[0])
exp.explain_instance(data_row=X_test[features].iloc[0].values, predict_fn=xgc_np.predict_proba).show_in_notebook()
print('Actual Label:', X_test['case_reversed'][1])
print('Predicted Label:', pred[1])
exp.explain_instance(data_row=X_test[features].iloc[1].values, predict_fn=xgc_np.predict_proba).show_in_notebook()